In [1]:
import numpy as np
import pandas as pd
import json
In [2]:
loans = pd.read_csv('lending-club-data.csv')
loans.head(2)
Out[2]:
In [3]:
# safe_loans = 1 => safe
# safe_loans = -1 => risky
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)
#loans = loans.remove_column('bad_loans')
loans = loans.drop('bad_loans', axis=1)
In [4]:
features = ['grade',          # grade of the loan
            'term',           # the term of the loan
            'home_ownership', # home ownership status: own, mortgage or rent
            'emp_length',     # number of years of employment
           ]
target = 'safe_loans'
In [5]:
loans = loans[features + [target]]
In [6]:
loans.iloc[122602]
Out[6]:
In [7]:
categorical_variables = []
for feat_name, feat_type in zip(loans.columns, loans.dtypes):
    if feat_type == object:
        categorical_variables.append(feat_name)

for feature in categorical_variables:
    loans_one_hot_encoded = pd.get_dummies(loans[feature], prefix=feature)
    #print loans_one_hot_encoded
    loans = loans.drop(feature, axis=1)
    for col in loans_one_hot_encoded.columns:
        loans[col] = loans_one_hot_encoded[col]

print loans.head(2)
print loans.columns
In [8]:
loans.iloc[122602]
Out[8]:
In [9]:
with open('module-5-assignment-2-train-idx.json') as train_data_file:
    train_idx = json.load(train_data_file)
with open('module-5-assignment-2-test-idx.json') as test_data_file:
    test_idx = json.load(test_data_file)

print train_idx[:3]
print test_idx[:3]
In [10]:
print len(train_idx)
print len(test_idx)
In [11]:
train_data = loans.iloc[train_idx]
test_data = loans.iloc[test_idx]
In [12]:
print len(loans.dtypes)
Recall from the lecture that prediction at an intermediate node works by predicting the majority class for all data points that belong to this node. Now, we will write a function that calculates the number of misclassified examples when predicting the majority class. This will be used to help determine which feature is the best to split on at a given node of the tree.
Note: Keep in mind that in order to compute the number of mistakes for a majority classifier, we only need the label (y values) of the data points in the node.
Steps to follow:
Step 1: Count the number of +1 labels (safe loans) in the node.
Step 2: Count the number of -1 labels (risky loans) in the node.
Step 3: Return the smaller of the two counts, since the majority classifier misclassifies every example in the minority class.
A small worked example of this computation follows; the implementation is in the cell after it.
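For intuition, here is a minimal sketch (using a hypothetical toy label array, not the loan data) of the computation the function below performs:

# Hypothetical toy labels: three safe loans (+1) and two risky loans (-1).
toy_labels = np.array([+1, +1, +1, -1, -1])

# The majority class is +1, so a majority classifier predicts +1 everywhere
# and misclassifies the two -1 examples.
num_safe  = (toy_labels == +1).sum()   # 3
num_risky = (toy_labels == -1).sum()   # 2
print min(num_safe, num_risky)         # prints 2 -- the number of mistakes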
In [13]:
def intermediate_node_num_mistakes(labels_in_node):
    # Corner case: If labels_in_node is empty, return 0
    if len(labels_in_node) == 0:
        return 0

    # Count the number of 1's (safe loans)
    ## YOUR CODE HERE
    safe_loan = (labels_in_node == 1).sum()

    # Count the number of -1's (risky loans)
    ## YOUR CODE HERE
    risky_loan = (labels_in_node == -1).sum()

    # Return the number of mistakes that the majority classifier makes.
    ## YOUR CODE HERE
    return min(safe_loan, risky_loan)
In [14]:
# Test case 1
example_labels = np.array([-1, -1, 1, 1, 1])
if intermediate_node_num_mistakes(example_labels) == 2:
    print 'Test passed!'
else:
    print 'Test 1 failed... try again!'

# Test case 2
example_labels = np.array([-1, -1, 1, 1, 1, 1, 1])
if intermediate_node_num_mistakes(example_labels) == 2:
    print 'Test passed!'
else:
    print 'Test 2 failed... try again!'

# Test case 3
example_labels = np.array([-1, -1, -1, -1, -1, 1, 1])
if intermediate_node_num_mistakes(example_labels) == 2:
    print 'Test passed!'
else:
    print 'Test 3 failed... try again!'
The function best_splitting_feature takes 3 arguments: the data (the data points that reached the current node), the list of features to consider splitting on, and the name of the target column (the class labels).
The function will loop through the list of possible features, consider splitting on each of them, calculate the classification error of each split, and return the feature that gives the smallest classification error.
Recall that the classification error is defined as follows:
classification error = (# of mistakes) / (# of data points)
Note: Remember that since we are only dealing with binary features, we do not have to consider thresholds for real-valued features. This makes the implementation of this function much easier.
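For concreteness, here is a minimal sketch (on a hypothetical toy DataFrame with a made-up binary feature 'grade_A'; not part of the assignment) of how the error of a single split is computed:

toy = pd.DataFrame({'grade_A':    [1, 1, 0, 0, 0],
                    'safe_loans': [+1, +1, +1, -1, -1]})

left_split  = toy[toy['grade_A'] == 0]   # labels: [+1, -1, -1] -> majority -1, 1 mistake
right_split = toy[toy['grade_A'] == 1]   # labels: [+1, +1]     -> majority +1, 0 mistakes

# classification error of splitting on 'grade_A' = (1 + 0) / 5 = 0.2
error = (1 + 0) / float(len(toy))
print error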
Your code should be analogous to the filled-in function below.
In [15]:
def best_splitting_feature(data, features, target):

    target_values = data[target]

    best_feature = None # Keep track of the best feature
    best_error = 10     # Keep track of the best error so far
    # Note: Since error is always <= 1, we should initialize it with something larger than 1.

    # Convert to float to make sure error gets computed correctly.
    num_data_points = float(len(data))

    # Loop through each feature to consider splitting on that feature
    for feature in features:

        # The left split will have all data points where the feature value is 0
        left_split = data[data[feature] == 0]

        # The right split will have all data points where the feature value is 1
        ## YOUR CODE HERE
        right_split = data[data[feature] == 1]

        # Calculate the number of misclassified examples in the left split.
        # Remember that we implemented a function for this! (It was called intermediate_node_num_mistakes)
        ## YOUR CODE HERE
        left_mistakes = intermediate_node_num_mistakes(left_split[target])

        # Calculate the number of misclassified examples in the right split.
        ## YOUR CODE HERE
        right_mistakes = intermediate_node_num_mistakes(right_split[target])

        # Compute the classification error of this split.
        # Error = (# of mistakes (left) + # of mistakes (right)) / (# of data points)
        ## YOUR CODE HERE
        error = (left_mistakes + right_mistakes) / num_data_points

        # If this is the best error we have found so far, store the feature as best_feature and the error as best_error
        ## YOUR CODE HERE
        if error < best_error:
            best_feature = feature
            best_error = error

    return best_feature # Return the best feature we found
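As a quick sanity check (a sketch only, with hypothetical feature names and toy data), splitting on a feature that separates the classes perfectly should win:

toy = pd.DataFrame({'grade_A':    [1, 1, 0, 0],
                    'term_36':    [1, 0, 1, 0],
                    'safe_loans': [+1, +1, -1, -1]})

# 'grade_A' separates the classes perfectly (error 0.0), while splitting on
# 'term_36' leaves one mistake on each side (error 0.5).
print best_splitting_feature(toy, ['grade_A', 'term_36'], 'safe_loans')   # prints grade_A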
In [18]:
def create_leaf(target_values):

    # Create a leaf node
    leaf = {'splitting_feature' : None,
            'left'    : None,
            'right'   : None,
            'is_leaf' : True }   ## YOUR CODE HERE

    # Count the number of data points that are +1 and -1 in this node.
    num_ones = len(target_values[target_values == +1])
    num_minus_ones = len(target_values[target_values == -1])

    # For the leaf node, set the prediction to be the majority class.
    # Store the predicted class (1 or -1) in leaf['prediction']
    if num_ones > num_minus_ones:
        leaf['prediction'] = 1    ## YOUR CODE HERE
    else:
        leaf['prediction'] = -1   ## YOUR CODE HERE

    # Return the leaf node
    return leaf
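A quick sanity check (hypothetical toy labels, not part of the assignment): a node where risky loans are the majority should become a leaf that predicts -1.

toy_labels = pd.Series([+1, -1, -1])
toy_leaf = create_leaf(toy_labels)
print toy_leaf['prediction']   # prints -1
print toy_leaf['is_leaf']      # prints True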
In [34]:
def decision_tree_create(data, features, target, current_depth = 0, max_depth = 10):

    remaining_features = features[:] # Make a copy of the features.

    target_values = data[target]
    print "--------------------------------------------------------------------"
    print "Subtree, depth = %s (%s data points)." % (current_depth, len(target_values))

    # Stopping condition 1
    # (Check if there are mistakes at the current node.
    #  Recall you wrote a function intermediate_node_num_mistakes to compute this.)
    if intermediate_node_num_mistakes(target_values) == 0:  ## YOUR CODE HERE
        print "Stopping condition 1 reached."
        # If there are no mistakes at the current node, make it a leaf node
        return create_leaf(target_values)

    # Stopping condition 2 (check if there are remaining features to consider splitting on)
    if remaining_features == []:   ## YOUR CODE HERE
        print "Stopping condition 2 reached."
        # If there are no remaining features to consider, make the current node a leaf node
        return create_leaf(target_values)

    # Additional stopping condition (limit tree depth)
    if current_depth >= max_depth:  ## YOUR CODE HERE
        print "Reached maximum depth. Stopping for now."
        # If the max tree depth has been reached, make the current node a leaf node
        return create_leaf(target_values)

    # Find the best splitting feature (recall the function best_splitting_feature implemented above)
    ## YOUR CODE HERE
    splitting_feature = best_splitting_feature(data, remaining_features, target)

    # Split on the best feature that we found.
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]   ## YOUR CODE HERE
    remaining_features.remove(splitting_feature)
    print "Split on feature %s. (%s, %s)" % (
        splitting_feature, len(left_split), len(right_split))

    # Create a leaf node if the split is "perfect"
    if len(left_split) == len(data):
        print "Creating leaf node."
        return create_leaf(left_split[target])
    if len(right_split) == len(data):
        print "Creating leaf node."
        ## YOUR CODE HERE
        return create_leaf(right_split[target])

    # Repeat (recurse) on left and right subtrees
    left_tree = decision_tree_create(left_split, remaining_features, target, current_depth + 1, max_depth)
    ## YOUR CODE HERE
    right_tree = decision_tree_create(right_split, remaining_features, target, current_depth + 1, max_depth)

    return {'is_leaf'          : False,
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree,
            'right'            : right_tree}
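Before training on the full training set, a smoke test on a hypothetical two-feature toy frame (a sketch only) can help confirm that the recursion terminates and splits where expected:

toy = pd.DataFrame({'grade_A':    [1, 1, 0, 0],
                    'term_36':    [1, 0, 1, 0],
                    'safe_loans': [+1, +1, -1, -1]})
toy_tree = decision_tree_create(toy, ['grade_A', 'term_36'], 'safe_loans', max_depth = 2)
print toy_tree['splitting_feature']    # expected: grade_A
print toy_tree['left']['prediction']   # expected: -1 (the grade_A == 0 side)
print toy_tree['right']['prediction']  # expected: +1 (the grade_A == 1 side)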
In [23]:
input_features = train_data.columns
print list(input_features)
In [32]:
one_hot_features = list(train_data.columns)
one_hot_features.remove('safe_loans')   # keep only the one-hot encoded feature columns
print one_hot_features
print list(train_data.columns)
In [35]:
my_decision_tree = decision_tree_create(train_data, one_hot_features, 'safe_loans', current_depth = 0, max_depth = 6)
In [39]:
def classify(tree, x, annotate = False):
    # If the node is a leaf node, return its prediction.
    if tree['is_leaf']:
        if annotate:
            print "At leaf, predicting %s" % tree['prediction']
        return tree['prediction']
    else:
        # Split on the feature stored in this node.
        split_feature_value = x[tree['splitting_feature']]
        if annotate:
            print "Split on %s = %s" % (tree['splitting_feature'], split_feature_value)
        if split_feature_value == 0:
            return classify(tree['left'], x, annotate)
        else:
            ### YOUR CODE HERE
            return classify(tree['right'], x, annotate)
In [42]:
print test_data.iloc[0]
print 'Predicted class: %s ' % classify(my_decision_tree, test_data.iloc[0])
In [43]:
classify(my_decision_tree, test_data.iloc[0], annotate=True)
Out[43]:
This function should return a prediction (class label) for each row in data using the decision tree. Your code should be analogous to the function below; a quick sanity check on hypothetical toy data follows the cell.
In [69]:
def evaluate_classification_error(tree, data):
    # Apply classify(tree, x) to each row in the data
    prediction = data.apply(lambda x: classify(tree, x), axis=1)

    # Once you've made the predictions, calculate the classification error and return it
    ## YOUR CODE HERE
    return (data['safe_loans'] != prediction).sum() / float(len(data))
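A quick sanity check with a hypothetical one-leaf "tree" that always predicts +1 (a sketch, not required by the assignment): it should be wrong on exactly one of three toy rows.

toy_leaf = {'is_leaf': True, 'prediction': +1, 'splitting_feature': None,
            'left': None, 'right': None}
toy = pd.DataFrame({'grade_A': [1, 0, 0], 'safe_loans': [+1, +1, -1]})
print evaluate_classification_error(toy_leaf, toy)   # prints 0.333... (1 mistake out of 3)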
In [70]:
evaluate_classification_error(my_decision_tree, test_data)
Out[70]:
In [76]:
def print_stump(tree, name = 'root'):
    split_name = tree['splitting_feature'] # split_name is something like 'term_ 36 months'
    if split_name is None:
        print "(leaf, label: %s)" % tree['prediction']
        return None
    split_feature, split_value = split_name.split('_', 1)
    print '                       %s' % name
    print '         |---------------|----------------|'
    print '         |                                 |'
    print '         |                                 |'
    print '         |                                 |'
    print '  [{0} == 0]               [{0} == 1]    '.format(split_name)
    print '         |                                 |'
    print '         |                                 |'
    print '         |                                 |'
    print '    (%s)                         (%s)' \
        % (('leaf, label: ' + str(tree['left']['prediction']) if tree['left']['is_leaf'] else 'subtree'),
           ('leaf, label: ' + str(tree['right']['prediction']) if tree['right']['is_leaf'] else 'subtree'))
In [77]:
print_stump(my_decision_tree)
In [78]:
print_stump(my_decision_tree['left'], my_decision_tree['splitting_feature'])
In [79]:
print_stump(my_decision_tree['left']['left'], my_decision_tree['left']['splitting_feature'])
In [81]:
print_stump(my_decision_tree['right'], my_decision_tree['splitting_feature'])
In [82]:
print_stump(my_decision_tree['right']['right'], my_decision_tree['right']['splitting_feature'])
In [ ]: